import pandas as pd
import numpy as np
from openai import OpenAI
from sklearn.metrics.pairwise import cosine_similarity
import os

# Read the input CSV into a dataframe
print("Loading dataframe from CSV...")
csv_path = 'misinformation_codes.csv'
df = pd.read_csv(csv_path)
print("Dataframe loaded successfully.")

# Build the OpenAI client; the API key is looked up in the OPENAI environment
# variable, so a missing variable raises KeyError at this point.
print("Initializing OpenAI API client...")
client = OpenAI(
    api_key=os.environ['OPENAI'],
)
print("OpenAI API client initialized.")


# Embeddings fetched so far in this process, keyed by (model, normalized text).
# The script filters the same claims once per threshold, so without this cache
# every claim would be sent to the embeddings API again for each threshold.
_EMBEDDING_CACHE = {}


# Function to get embeddings
def get_embedding(text, model="text-embedding-ada-002"):
    """Return the embedding of ``text`` from the OpenAI embeddings API.

    Newlines in ``text`` are replaced with spaces before the request is made.
    Results are memoized per ``(model, normalized text)`` for the lifetime of
    the process, so repeated calls with the same input issue only one request.

    Args:
        text: The string to embed. Must support ``str.replace``.
        model: Name of the embedding model to request.

    Returns:
        The ``embedding`` attribute of the first item in the API response.
        Cache hits return the same object as the first call, so callers
        should treat the result as read-only.
    """
    text = text.replace("\n", " ")
    cache_key = (model, text)
    if cache_key not in _EMBEDDING_CACHE:
        response = client.embeddings.create(input=[text], model=model)
        _EMBEDDING_CACHE[cache_key] = response.data[0].embedding
    return _EMBEDDING_CACHE[cache_key]


# Function to check if claim is unique based on cosine similarity
def is_unique(claim_embedding, unique_embeddings, threshold):
    """Return True if the candidate is not too similar to any kept embedding.

    All similarities are computed in one vectorized NumPy operation instead of
    one pairwise library call per stored embedding.

    Args:
        claim_embedding: 1-D sequence of numbers for the candidate claim.
        unique_embeddings: Sequence of embeddings (each the same length as
            ``claim_embedding``) that were already accepted; may be empty.
        threshold: Cosine-similarity cutoff. A similarity greater than or
            equal to this value marks the candidate as a duplicate.

    Returns:
        ``True`` when every cosine similarity is below ``threshold`` (or when
        ``unique_embeddings`` is empty), ``False`` otherwise.
    """
    # Nothing accepted yet: the candidate is trivially unique.
    if len(unique_embeddings) == 0:
        return True

    candidate = np.asarray(claim_embedding, dtype=float)
    accepted = np.asarray(unique_embeddings, dtype=float)

    candidate_norm = np.linalg.norm(candidate)
    accepted_norms = np.linalg.norm(accepted, axis=1)

    # A zero vector has no direction; dividing by 1 instead of 0 gives it a
    # similarity of 0 to everything rather than NaN.
    if candidate_norm == 0:
        candidate_norm = 1.0
    accepted_norms[accepted_norms == 0] = 1.0

    unit_candidate = candidate / candidate_norm
    unit_accepted = accepted / accepted_norms[:, np.newaxis]
    similarities = unit_accepted @ unit_candidate

    return not bool(np.any(similarities >= threshold))


# Thresholds for cosine similarity, in descending order.
# Each value is passed to filter_unique_claims in turn; inside is_unique a
# claim whose similarity to an already-kept claim is >= the threshold is
# rejected as a duplicate, so lower thresholds reject more claims.
thresholds = [0.99, 0.95, 0.90, 0.85, 0.80, 0.75, 0.70, 0.65]


# Function to process claims and filter unique ones
def filter_unique_claims(claims, threshold):
    """Greedily keep the claims that are not near-duplicates of earlier ones.

    Claims are visited in order. Each one is embedded with ``get_embedding``
    and compared, via ``is_unique``, against the embeddings of the claims
    kept so far; it is kept only if ``is_unique`` returns True. Progress and
    the decision for every claim are printed.

    Args:
        claims: Sequence of claim strings to process.
        threshold: Similarity cutoff forwarded to ``is_unique``.

    Returns:
        List of the kept claims, in their original order.
    """
    kept_claims = []
    kept_vectors = []

    for position, text in enumerate(claims, start=1):
        print(
            f"Processing claim {position}/{len(claims)} for threshold {threshold}..."
        )
        vector = get_embedding(text)

        # Guard clause: report and skip anything too close to a kept claim.
        if not is_unique(vector, kept_vectors, threshold):
            print(f"Claim '{text}' is not unique and not added to the list.")
            continue

        kept_claims.append(text)
        kept_vectors.append(vector)
        print(f"Claim '{text}' is unique and added to the list.")

    return kept_claims


# Extract claims from neg_rep and neg_dem columns
print("Extracting claims from 'neg_rep' and 'neg_dem' columns...")
# pandas reads empty CSV cells as NaN (a float). get_embedding calls
# str.replace on every claim, so missing values are dropped here instead of
# crashing the run partway through.
neg_rep_claims = df["neg_rep"].dropna().tolist()
neg_dem_claims = df["neg_dem"].dropna().tolist()
print("Claims extracted successfully.")

# Filter unique claims for each threshold; each mapping goes
# threshold -> list of claims kept at that threshold.
print("Filtering unique claims for each threshold...")
unique_claims_neg_rep = {
    threshold: filter_unique_claims(neg_rep_claims, threshold)
    for threshold in thresholds
}
unique_claims_neg_dem = {
    threshold: filter_unique_claims(neg_dem_claims, threshold)
    for threshold in thresholds
}
print("Unique claims filtered successfully.")

# Convert to DataFrame: one row per threshold, with each claims column
# holding the whole list of kept claims for that threshold in a single cell.
print("Converting unique claims to DataFrame...")
unique_claims_df = pd.DataFrame({
    "threshold": thresholds,
    "unique_neg_rep_claims": [
        unique_claims_neg_rep[threshold] for threshold in thresholds
    ],
    "unique_neg_dem_claims": [
        unique_claims_neg_dem[threshold] for threshold in thresholds
    ],
})
print("Conversion to DataFrame successful.")

# Save to CSV (without the dataframe index column)
print("Saving unique claims to CSV...")
unique_claims_df.to_csv('unique_claims.csv', index=False)
print("Unique claims saved to 'unique_claims.csv'")
